1 FORBES BILLIONAIRES EDA

1.1 Importing Libraries

pacman::p_load(
  #data wrangling
  tidyverse, stringr,
  #data visualization
  ggplot2, RColorBrewer, ggsci,
  plotly, ggpubr, vtable
)

Reading CSV file

# Load the Forbes 2022 list and convert the header row to snake_case names
# (the unnamed leading index column ends up as `x1`).
forbes <- janitor::clean_names(read_csv("2022_forbes_billionaires.csv"))
## New names:
## Rows: 2600 Columns: 8
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (5): name, networth, country, source, industry dbl (3): ...1, rank, age
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Print the tibble to preview its first rows.
forbes
## # A tibble: 2,600 × 8
##       x1  rank name                     networth   age country    source indus…¹
##    <dbl> <dbl> <chr>                    <chr>    <dbl> <chr>      <chr>  <chr>  
##  1     0     1 Elon Musk                $219 B      50 United St… Tesla… Automo…
##  2     1     2 Jeff Bezos               $171 B      58 United St… Amazon Techno…
##  3     2     3 Bernard Arnault & family $158 B      73 France     LVMH   Fashio…
##  4     3     4 Bill Gates               $129 B      66 United St… Micro… Techno…
##  5     4     5 Warren Buffett           $118 B      91 United St… Berks… Financ…
##  6     5     6 Larry Page               $111 B      49 United St… Google Techno…
##  7     6     7 Sergey Brin              $107 B      48 United St… Google Techno…
##  8     7     8 Larry Ellison            $106 B      77 United St… softw… Techno…
##  9     8     9 Steve Ballmer            $91.4 B     66 United St… Micro… Techno…
## 10     9    10 Mukesh Ambani            $90.7 B     64 India      diver… Divers…
## # … with 2,590 more rows, and abbreviated variable name ¹​industry
# Check the content of the dataset: column names, types and leading values.

glimpse(forbes)
## Rows: 2,600
## Columns: 8
## $ x1       <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
## $ rank     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ name     <chr> "Elon Musk", "Jeff Bezos", "Bernard Arnault & family", "Bill …
## $ networth <chr> "$219 B", "$171 B", "$158 B", "$129 B", "$118 B", "$111 B", "…
## $ age      <dbl> 50, 58, 73, 66, 91, 49, 48, 77, 66, 64, 59, 80, 82, 68, 37, 7…
## $ country  <chr> "United States", "United States", "France", "United States", …
## $ source   <chr> "Tesla, SpaceX", "Amazon", "LVMH", "Microsoft", "Berkshire Ha…
## $ industry <chr> "Automotive", "Technology", "Fashion & Retail", "Technology",…

1.2 Data Cleaning

# Rebuild the table with capitalised column names. The `x1` index column that
# came from the CSV is not carried over.
forbes <- with(
  forbes,
  tibble(
    Rank = rank,
    Names = name,
    Networth = networth,
    Age = age,
    Country = country,
    Source = source,
    Industry = industry
  )
)
forbes %>% glimpse()
## Rows: 2,600
## Columns: 7
## $ Rank     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ Names    <chr> "Elon Musk", "Jeff Bezos", "Bernard Arnault & family", "Bill …
## $ Networth <chr> "$219 B", "$171 B", "$158 B", "$129 B", "$118 B", "$111 B", "…
## $ Age      <dbl> 50, 58, 73, 66, 91, 49, 48, 77, 66, 64, 59, 80, 82, 68, 37, 7…
## $ Country  <chr> "United States", "United States", "France", "United States", …
## $ Source   <chr> "Tesla, SpaceX", "Amazon", "LVMH", "Microsoft", "Berkshire Ha…
## $ Industry <chr> "Automotive", "Technology", "Fashion & Retail", "Technology",…
# Check for duplicate rows. unique() keeps one copy of every distinct row, so
# despite its name `duplicate` holds the de-duplicated table. If it has as
# many rows as `forbes`, there are no fully duplicated rows.
duplicate <- unique(forbes)
duplicate
## # A tibble: 2,600 × 7
##     Rank Names                    Networth   Age Country       Source    Indus…¹
##    <dbl> <chr>                    <chr>    <dbl> <chr>         <chr>     <chr>  
##  1     1 Elon Musk                $219 B      50 United States Tesla, S… Automo…
##  2     2 Jeff Bezos               $171 B      58 United States Amazon    Techno…
##  3     3 Bernard Arnault & family $158 B      73 France        LVMH      Fashio…
##  4     4 Bill Gates               $129 B      66 United States Microsoft Techno…
##  5     5 Warren Buffett           $118 B      91 United States Berkshir… Financ…
##  6     6 Larry Page               $111 B      49 United States Google    Techno…
##  7     7 Sergey Brin              $107 B      48 United States Google    Techno…
##  8     8 Larry Ellison            $106 B      77 United States software  Techno…
##  9     9 Steve Ballmer            $91.4 B     66 United States Microsoft Techno…
## 10    10 Mukesh Ambani            $90.7 B     64 India         diversif… Divers…
## # … with 2,590 more rows, and abbreviated variable name ¹​Industry
# Linear positions of any missing cells in the de-duplicated table.
Null_values <- duplicate %>%
  is.na() %>%
  which()
# Strip the dollar sign and the "B" (billion) suffix from Networth, then
# convert what is left to a number.
forbes <- forbes %>%
  mutate(Networth = as.numeric(str_remove_all(Networth, "[$B]")))
# Summarise every column; Networth should now be reported as numeric.
forbes %>% summary()
##       Rank         Names              Networth            Age        
##  Min.   :   1   Length:2600        Min.   :  1.000   Min.   : 19.00  
##  1st Qu.: 637   Class :character   1st Qu.:  1.500   1st Qu.: 55.00  
##  Median :1292   Mode  :character   Median :  2.400   Median : 64.00  
##  Mean   :1270                      Mean   :  4.861   Mean   : 64.27  
##  3rd Qu.:1929                      3rd Qu.:  4.500   3rd Qu.: 74.00  
##  Max.   :2578                      Max.   :219.000   Max.   :100.00  
##    Country             Source            Industry        
##  Length:2600        Length:2600        Length:2600       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
## 

2 Exploratory Data Analysis

# Summary statistics (N, mean, SD, min, quartiles, max) for Age and Networth.
# out = "return" asks vtable for the table as a data frame directly. The
# earlier out = "csv" without a `file` argument also returned a data frame,
# but raised a warning on every run.
statistics <- forbes %>%
  select(Age, Networth) %>%
  st(out = "return") %>%
  as.data.frame()
## Warning in st(., out = "csv"): out = "csv" will just return the vtable as a
## data.frame unless combined with file
# Display the summary-statistics table.
statistics
##   Variable    N   Mean Std. Dev. Min Pctl. 25 Pctl. 75 Max
## 1      Age 2600 64.272    13.221  19       55       74 100
## 2 Networth 2600  4.861     10.66   1      1.5      4.5 219

2.1 Evaluation based on their names and networth

2.2 Top 10 billionaires and their networth

# Bar chart of the first ten rows of `forbes`: one bar per person, ordered
# from highest to lowest net worth, each labelled with its value.
Top <- head(forbes, 10) %>%
  ggplot(aes(reorder(Names, -Networth), y = Networth, fill = Names)) +
  geom_col() +
  geom_text(aes(label = Networth), size = 2.5, vjust = 0.05) +
  labs(
    title = "Top 10 billionaires and their networth",
    x = "Names",
    y = "Networth"
  ) +
  theme_classic() +
  scale_fill_brewer(palette = "Spectral") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
Top

2.3 Countries of the top 10 billionaires

# Bubble chart of the first ten rows: one marker per person, sized by net
# worth and coloured by country.
Countries <- plot_ly(
  head(forbes, 10),
  x = ~Names,
  y = ~Networth,
  color = ~Country,
  type = "scatter",
  mode = "markers",
  marker = list(size = ~Networth)
) %>%
  layout(title = "Countries of top 10 billionaires")
Countries

2.4 Industries of top 10 billionaires

# Bubble chart of the first ten rows: one marker per person, sized by net
# worth and coloured by industry.
Industries <- plot_ly(
  head(forbes, 10),
  x = ~Names,
  y = ~Networth,
  color = ~Industry,
  type = "scatter",
  mode = "markers",
  marker = list(size = ~Networth)
) %>%
  layout(title = "Industry of top 10 billionaires")
Industries

2.5 Sources of top 10 billionaires

# Bubble chart of the first ten rows: one marker per person, sized by net
# worth and coloured by the source of their wealth.
Source <- plot_ly(
  head(forbes, 10),
  x = ~Names,
  y = ~Networth,
  color = ~Source,
  type = "scatter",
  mode = "markers",
  marker = list(size = ~Networth)
) %>%
  layout(title = "Sources of top 10 billionaires")
Source

3 Distribution of age in billionaires

3.1 Top 10 billionaires and their ages

# Bar chart of the ages of the first ten rows of `forbes`, oldest first.
# The y axis shows Age, so it is labelled "Age" (it was mislabelled
# "Networth").
bar <- forbes %>%
  head(10) %>%
  ggplot(aes(x = reorder(Names, -Age), y = Age, fill = Names)) +
  geom_col() +
  geom_text(aes(label = Age), size = 2.5, vjust = 0.05) +
  labs(
    title = "Top 10 billionaires and their age",
    x = "Names",
    y = "Age"
  ) +
  theme_classic() +
  scale_fill_brewer(palette = "BrBG") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
bar

3.2 Top 10 oldest billionaires

# Sort the table from oldest to youngest, then chart the ten oldest people.
forbes_Old <- forbes[order(forbes$Age, decreasing = TRUE), ]
# The x axis shows Names, so it is labelled "Names" (it was mislabelled
# "Networth").
Forbes_Old2 <- forbes_Old %>%
  head(10) %>%
  ggplot(aes(x = Names, y = Age, fill = Names)) +
  geom_col() +
  geom_text(aes(label = Age), size = 2.5, vjust = 0.05) +
  labs(
    title = "Top 10 oldest billionaires",
    x = "Names",
    y = "Age"
  ) +
  theme_classic() +
  scale_fill_brewer(palette = "PuOr") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
Forbes_Old2

3.3 Top 10 youngest billionaires

# Sort from oldest to youngest; the last ten rows are then the youngest
# people in the table.
forbes_Old <- forbes %>% slice(order(Age, decreasing = TRUE))
Tail_10 <- tail(forbes_Old, 10) %>%
  ggplot(aes(x = Names, y = Age, fill = Names)) +
  geom_col() +
  geom_text(aes(label = Age), size = 2.5, vjust = 0.05) +
  labs(
    title = "Top 10 youngest billionaires",
    x = "Names",
    y = "Age"
  ) +
  theme_classic() +
  scale_fill_brewer(palette = "PuOr") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
Tail_10

3.4 Networth of top billionaires

# Total net worth per country, largest first.
worth <- forbes %>%
  group_by(Country) %>%
  summarise(Top_billionaire = sum(Networth)) %>%
  arrange(desc(Top_billionaire))
# Keep the ten countries with the highest combined net worth.
Top_countries <- worth %>% head(10)
# The former `options(warn = -1)` call was removed here: it switched off every
# warning for the rest of the session, which can hide real problems in later
# chunks. Silence specific warnings locally if they are known to be harmless.
Top_countries1 <- Top_countries %>%
  plot_ly(
    x = ~Country,
    y = ~Top_billionaire,
    type = "scatter",
    mode = "markers",
    size = 10,
    color = ~Country,
    colors = "Dark2"
  ) %>%
  layout(
    title = "Total networth of billionaires in top 10 countries",
    yaxis = list(title = "Networth")
  )
Top_countries1

3.5 Distribution of age in billionaires

# Histogram of billionaire ages, coloured by age value.
Age <- plot_ly(
  forbes,
  x = ~Age,
  color = ~Age,
  colors = "Paired",
  type = "histogram"
) %>%
  layout(
    title = "Distribution of Age in billionaires",
    yaxis = list(title = "Count")
  )
Age

3.6 Top 5 industries and their countries

# Number of billionaires for each country/industry pair, most common first.
# `arrange(desc = n)` only created a sort key *named* "desc" and therefore
# sorted ascending; `desc(n)` gives the intended descending order.
# `.groups = "drop"` returns an ungrouped tibble, so later steps do not
# inherit the Country grouping.
Industries <- forbes %>%
  group_by(Country, Industry) %>%
  summarise(n = n(), .groups = "drop") %>%
  arrange(desc(n))
## `summarise()` has grouped output by 'Country'. You can override using the
## `.groups` argument.
# Display the per-country, per-industry billionaire counts.
Industries
## # A tibble: 448 × 3
## # Groups:   Country [75]
##    Country   Industry                  n
##    <chr>     <chr>                 <int>
##  1 Algeria   Food & Beverage           1
##  2 Argentina Diversified               1
##  3 Argentina Healthcare                1
##  4 Argentina Real Estate               1
##  5 Argentina Technology                1
##  6 Australia Automotive                1
##  7 Australia Diversified               1
##  8 Australia Gambling & Casinos        1
##  9 Australia Logistics                 1
## 10 Australia Media & Entertainment     1
## # … with 438 more rows

3.7 Number of United States billionaires per industry

# Industry counts for the United States only.
United <- filter(Industries, Country == "United States")
# Palette generator that interpolates blue -> yellow -> green -> red, used to
# produce the 18 colours shared by the per-country industry charts.
fun_color_range <- colorRampPalette(
  colors = c("Blue", "Yellow", "Green", "Red")
)
my_colors <- fun_color_range(n = 18)
my_colors
##  [1] "#0000FF" "#2D2DD2" "#5A5AA4" "#878778" "#B4B44A" "#E1E11D" "#F0FF00"
##  [8] "#C3FF00" "#95FF00" "#68FF00" "#3BFF00" "#0EFF00" "#1DE100" "#4AB400"
## [15] "#778700" "#A45A00" "#D22D00" "#FF0000"
# Bubble chart of the number of United States billionaires in each industry.
# `n` is a count of people, not net worth, so the title and y-axis label say
# so (they previously read "networth" and "United States").
United_States <- United %>%
  plot_ly(
    x = ~Industry,
    y = ~n,
    type = "scatter",
    mode = "markers",
    size = ~n,
    color = ~Industry,
    colors = my_colors
  ) %>%
  layout(
    title = "Number of United States billionaires per industry",
    yaxis = list(title = "Number of billionaires")
  )
# Show the chart (it was previously built but never displayed), then the
# underlying table.
United_States

United
## # A tibble: 18 × 3
## # Groups:   Country [1]
##    Country       Industry                       n
##    <chr>         <chr>                      <int>
##  1 United States Metals & Mining                2
##  2 United States Construction & Engineering     5
##  3 United States Telecom                        5
##  4 United States Gambling & Casinos             6
##  5 United States Logistics                      6
##  6 United States Diversified                   15
##  7 United States Automotive                    16
##  8 United States Service                       18
##  9 United States Sports                        24
## 10 United States Manufacturing                 25
## 11 United States Energy                        32
## 12 United States Healthcare                    32
## 13 United States Media & Entertainment         41
## 14 United States Real Estate                   46
## 15 United States Fashion & Retail              53
## 16 United States Food & Beverage               63
## 17 United States Technology                   137
## 18 United States Finance & Investments        193

3.8 Number of Chinese billionaires per industry

# Industry counts for China only.
China <- Industries %>% filter(Country == "China")
# Bubble chart of the number of Chinese billionaires in each industry.
# `n` is a count of people, not net worth, so the title and y-axis label say
# so (they previously read "networth" and "China").
China_Industries <- China %>%
  plot_ly(
    x = ~Industry,
    y = ~n,
    type = "scatter",
    mode = "markers",
    size = ~n,
    color = ~Industry,
    colors = my_colors
  ) %>%
  layout(
    title = "Number of Chinese billionaires per industry",
    yaxis = list(title = "Number of billionaires")
  )
China_Industries

3.9 Number of Indian billionaires per industry

# Industry counts for India only.
India <- Industries %>% filter(Country == "India")
# Bubble chart of the number of Indian billionaires in each industry.
# `n` is a count of people, not net worth, so the title and y-axis label say
# so (they previously read "networth" and "India").
India_Industries <- India %>%
  plot_ly(
    x = ~Industry,
    y = ~n,
    type = "scatter",
    mode = "markers",
    size = ~n,
    color = ~Industry,
    colors = my_colors
  ) %>%
  layout(
    title = "Number of Indian billionaires per industry",
    yaxis = list(title = "Number of billionaires")
  )
India_Industries

3.10 Number of German billionaires per industry

# Industry counts for Germany only.
Germany <- Industries %>% filter(Country == "Germany")
# Bubble chart of the number of German billionaires in each industry.
# `n` is a count of people, not net worth, so the title and y-axis label say
# so (they previously read "networth" and "Germany").
Germany_Industries <- Germany %>%
  plot_ly(
    x = ~Industry,
    y = ~n,
    type = "scatter",
    mode = "markers",
    size = ~n,
    color = ~Industry,
    colors = my_colors
  ) %>%
  layout(
    title = "Number of German billionaires per industry",
    yaxis = list(title = "Number of billionaires")
  )
Germany_Industries

3.11 Number of French billionaires per industry

# Industry counts for France only.
France <- Industries %>% filter(Country == "France")
# Bubble chart of the number of French billionaires in each industry.
# `n` is a count of people, not net worth, so the title and y-axis label say
# so (they previously read "networth" and "France").
France_Industries <- France %>%
  plot_ly(
    x = ~Industry,
    y = ~n,
    type = "scatter",
    mode = "markers",
    size = ~n,
    color = ~Industry,
    colors = my_colors
  ) %>%
  layout(
    title = "Number of French billionaires per industry",
    yaxis = list(title = "Number of billionaires")
  )
France_Industries

3.12 Analysis according to source

# Bar chart of the ages of the first ten rows of `forbes`, oldest first.
# The y axis shows Age, so it is labelled "Age" (it was mislabelled
# "Networth").
bar <- forbes %>%
  head(10) %>%
  ggplot(aes(x = reorder(Names, -Age), y = Age, fill = Names)) +
  geom_col() +
  geom_text(aes(label = Age), size = 2.5, vjust = 0.05) +
  labs(
    title = "Top 10 billionaires and their age",
    x = "Names",
    y = "Age"
  ) +
  theme_classic() +
  scale_fill_brewer(palette = "BrBG") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
bar

# old billionaires

# Sort from oldest to youngest and chart the net worth of the ten oldest
# people, with bars coloured by the source of their wealth.
forbes_Old <- forbes %>% slice(order(Age, decreasing = TRUE))
Forbes_Old3 <- head(forbes_Old, 10) %>%
  ggplot(aes(x = Names, y = Networth, fill = Source)) +
  geom_col() +
  geom_text(aes(label = Networth), size = 2.5, vjust = 0.05) +
  labs(
    title = "Top 10 oldest billionaires and their sources",
    x = "Names",
    y = "Networth"
  ) +
  theme_classic() +
  scale_fill_brewer(palette = "PuOr") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
Forbes_Old3

## youngest billionaire and their sources

# Sort from oldest to youngest; the last ten rows are the youngest people.
# Chart their net worth with bars coloured by the source of their wealth.
forbes_Old <- forbes %>% slice(order(Age, decreasing = TRUE))
Tail_10 <- tail(forbes_Old, 10) %>%
  ggplot(aes(x = Names, y = Networth, fill = Source)) +
  geom_col() +
  geom_text(aes(label = Networth), size = 2.5, vjust = 0.05) +
  labs(
    title = "Top 10 youngest billionaires and their sources",
    x = "Names",
    y = "Networth"
  ) +
  theme_classic() +
  scale_fill_brewer(palette = "PuOr") +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
Tail_10

## Total networth of billionaires in top 10 sources
# Combined net worth for each source of wealth, largest first.
Sources <- forbes %>%
  group_by(Source) %>%
  summarise(Total_Networth = sum(Networth)) %>%
  arrange(desc(Total_Networth))
Sources
## # A tibble: 895 × 2
##    Source          Total_Networth
##    <chr>                    <dbl>
##  1 real estate               574.
##  2 diversified               382 
##  3 investments               358.
##  4 software                  290.
##  5 pharmaceuticals           284.
##  6 hedge funds               272.
##  7 Google                    261.
##  8 Walmart                   238 
##  9 Microsoft                 232.
## 10 Tesla, SpaceX             219 
## # … with 885 more rows
## Total networth of billionaires in top 10 sources
# Scatter chart of combined net worth for the ten largest sources of wealth.
Top_Billionaire_sources <- plot_ly(
  head(Sources, 10),
  x = ~Source,
  y = ~Total_Networth,
  color = ~Source,
  colors = my_colors,
  size = 10,
  type = "scatter",
  mode = "markers"
) %>%
  layout(
    title = "Total networth of billionaires in top 10 sources",
    yaxis = list(title = "Networth")
  )
Top_Billionaire_sources
## Total networth of billionaires in top 10 Industries
# Combined net worth for each industry, largest first.
Industry <- forbes %>%
  group_by(Industry) %>%
  summarise(Total_Networth = sum(Networth)) %>%
  arrange(desc(Total_Networth))
Industry
## # A tibble: 18 × 2
##    Industry                   Total_Networth
##    <chr>                               <dbl>
##  1 Technology                         2168. 
##  2 Finance & Investments              1734. 
##  3 Fashion & Retail                   1613. 
##  4 Manufacturing                      1080. 
##  5 Diversified                         940. 
##  6 Food & Beverage                     933. 
##  7 Healthcare                          709. 
##  8 Real Estate                         686. 
##  9 Automotive                          583. 
## 10 Media & Entertainment               494. 
## 11 Energy                              395. 
## 12 Metals & Mining                     390. 
## 13 Telecom                             205. 
## 14 Logistics                           196. 
## 15 Service                             186. 
## 16 Construction & Engineering          121. 
## 17 Gambling & Casinos                  108. 
## 18 Sports                               97.6
## Total networth of billionaires in top 10 Industries

# Scatter chart of combined net worth for the ten largest industries.
Top_Billionaire_Industry <- plot_ly(
  head(Industry, 10),
  x = ~Industry,
  y = ~Total_Networth,
  color = ~Industry,
  colors = my_colors,
  size = 10,
  type = "scatter",
  mode = "markers"
) %>%
  layout(
    title = "Total networth of billionaires in top 10 Industries",
    yaxis = list(title = "Networth")
  )
Top_Billionaire_Industry

#Session Info

# Record the R version, platform and package versions used for this run.
sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19043)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] vtable_1.3.4       kableExtra_1.3.4   ggpubr_0.4.0       plotly_4.10.0     
##  [5] ggsci_2.9          RColorBrewer_1.1-3 forcats_0.5.2      stringr_1.4.1     
##  [9] dplyr_1.0.10       purrr_0.3.5        readr_2.1.3        tidyr_1.2.1       
## [13] tibble_3.1.8       ggplot2_3.3.6      tidyverse_1.3.2   
## 
## loaded via a namespace (and not attached):
##  [1] fs_1.5.2            lubridate_1.8.0     bit64_4.0.5        
##  [4] insight_0.18.6      webshot_0.5.4       httr_1.4.4         
##  [7] tools_4.2.2         backports_1.4.1     bslib_0.4.0        
## [10] sjlabelled_1.2.0    utf8_1.2.2          R6_2.5.1           
## [13] DBI_1.1.3           lazyeval_0.2.2      colorspace_2.0-3   
## [16] withr_2.5.0         tidyselect_1.2.0    bit_4.0.4          
## [19] compiler_4.2.2      cli_3.4.1           rvest_1.0.3        
## [22] pacman_0.5.1        xml2_1.3.3          labeling_0.4.2     
## [25] sass_0.4.2          scales_1.2.1        systemfonts_1.0.4  
## [28] digest_0.6.29       rmarkdown_2.17      svglite_2.1.0      
## [31] pkgconfig_2.0.3     htmltools_0.5.3     highr_0.9          
## [34] dbplyr_2.2.1        fastmap_1.1.0       htmlwidgets_1.5.4  
## [37] rlang_1.0.6         readxl_1.4.1        rstudioapi_0.14    
## [40] farver_2.1.1        jquerylib_0.1.4     generics_0.1.3     
## [43] jsonlite_1.8.2      crosstalk_1.2.0     vroom_1.6.0        
## [46] car_3.1-1           googlesheets4_1.0.1 magrittr_2.0.3     
## [49] munsell_0.5.0       fansi_1.0.3         abind_1.4-5        
## [52] lifecycle_1.0.3     stringi_1.7.8       yaml_2.3.5         
## [55] snakecase_0.11.0    carData_3.0-5       grid_4.2.2         
## [58] parallel_4.2.2      crayon_1.5.2        haven_2.5.1        
## [61] hms_1.1.2           knitr_1.40          pillar_1.8.1       
## [64] ggsignif_0.6.4      reprex_2.0.2        glue_1.6.2         
## [67] evaluate_0.17       data.table_1.14.2   modelr_0.1.9       
## [70] vctrs_0.4.2         tzdb_0.3.0          cellranger_1.1.0   
## [73] gtable_0.3.1        assertthat_0.2.1    cachem_1.0.6       
## [76] xfun_0.33           janitor_2.1.0       broom_1.0.1        
## [79] rstatix_0.7.0       googledrive_2.0.0   viridisLite_0.4.1  
## [82] gargle_1.2.1        ellipsis_0.3.2